Explore called duplications

Author

Claudia Zirión-Martínez

Published

February 13, 2025

Index

Setup

Libraries

Code
library(tidyverse)
library(patchwork)
library(ggbeeswarm)
library(ggnewscale)
library(RColorBrewer)

Paths

Code
metadata_path <- "data/processed/metadata_ashton_desj_all_weavepop_H99.csv"
chromosomes_path <- "../Crypto_Desjardins_Ashton/results_clean/02.Dataset/chromosomes.csv"
chrom_lengths_path <- "data/processed/chromosome_lengths.tsv"
depth_by_chrom_good_desj_path <- "../Crypto_Desjardins/results_clean/04.Intermediate_files/02.Dataset/depth_quality/depth_by_chrom_good.tsv"
depth_by_chrom_good_ashton_path <- "../Crypto_Ashton/results_clean/04.Intermediate_files/02.Dataset/depth_quality/depth_by_chrom_good.tsv"
cnv_calls_path <- "../Crypto_Desjardins_Ashton/results_clean/02.Dataset/cnv/cnv_calls.tsv"
duplications_path <- "results/tables/duplications_putative.tsv"

repeats_path_prefix <- "../Crypto_Desjardins/results_clean/03.References/"

Prepare dataset

Metadata

Use the metadata table that has all the samples included in the final Crypto_Desjardins_Ashton dataset (n = 1055) and H99 .
Select needed columns, remove H99 and get the number of samples per dataset and lineage, per lineage, and total.

Code
metadata <- read.delim(
    metadata_path,
    header=TRUE,
    sep=",",
    stringsAsFactor = TRUE)
metadata <- metadata %>%
    select(sample, strain, source, lineage, dataset, vni_subdivision)%>%
    filter(!strain == "H99") %>%
    group_by(dataset, lineage)%>%
    mutate(samples_in_dataset_lineage = n_distinct(sample))%>%
    ungroup() %>%
    group_by(lineage)%>%
    mutate(samples_in_lineage = n_distinct(sample))%>%
    ungroup()%>%
    mutate(total_samples = n_distinct(sample))%>%
    droplevels()

Chromosome names and length

Get the nice chromosome names from the chromosomes file in WeavePop.

Code
chromosome_names = read.delim(
    chromosomes_path,
    header=TRUE, sep=",")
chromosome_names <- chromosome_names %>%
    mutate(chromosome = str_pad(chromosome, 2, pad = "0"))%>%
    mutate(chromosome = as.factor(chromosome))
levels(chromosome_names$chromosome) <- paste("chr", chromosome_names$chromosome, sep="")

Get the chromosome lengths. Create the file with bash.

Code
# /usr/bin/bash
tail -n +2 ../Crypto_Desjardins/config/chromosomes.csv | \
    cut -d',' -f1 | sort | uniq | while read line 
do
seqkit fx2tab ../Crypto_Desjardins/data/references/$line.fasta -l -i -n >> data/processed/chromosome_lengths.tsv
done
Code
chromosome_lengths = read.delim(
    chrom_lengths_path,
    header=FALSE, 
    col.names=c("accession", "length"), 
    sep="\t")

Get the chromosome median depth

Code
depth_by_chrom_good_desjardins <- read.delim(
    depth_by_chrom_good_desj_path,
     header=TRUE, sep="\t")
depth_by_chrom_good_ashton <- read.delim(
    depth_by_chrom_good_ashton_path,
     header=TRUE, sep="\t")
depth_by_chrom_good <- rbind(depth_by_chrom_good_desjardins, depth_by_chrom_good_ashton)
depth_by_chrom <- depth_by_chrom_good %>%
    select(sample, accession, norm_chrom_median, norm_chrom_mean)

Get metrics of duplications from called CNVs

Import the file with all called CNVs and keep only the duplications.

Code
cnv_calls <- read.delim(
    cnv_calls_path, 
    header=TRUE, sep="\t")
dup_calls <- filter(cnv_calls, cnv == "duplication") %>%
    left_join(chromosome_names, by="accession")
head(dup_calls)
accession start end cnv region_size depth norm_depth smooth_depth repeat_fraction overlap_bp feature_id sample lineage chromosome
chr02_Bt22 6001 10000 duplication 4000 491.62 8.62 2.90 0.21 832 SRS881178 VNBI chr02
chr05_Bt22 463501 475000 duplication 11500 282.04 4.95 4.95 0.12 1401 CNAG_00127,CNAG_00128,CNAG_01369,CNAG_07313 SRS881178 VNBI chr05
chr07_Bt22 606501 611000 duplication 4500 150.03 2.63 2.47 0.80 3622 CNAG_07666 SRS881178 VNBI chr07
chr12_Bt22 764001 775000 duplication 11000 95.71 1.68 1.67 0.00 0 CNAG_06985,CNAG_06986,CNAG_07011,CNAG_07918 SRS881178 VNBI chr12
chr07_Bt22 1423501 1427500 duplication 4000 172.81 2.58 1.64 0.86 3429 SRS885855 VNBI chr07
chr05_Bt22 464001 474500 duplication 10500 116.48 2.53 2.52 0.09 901 CNAG_00127,CNAG_00128,CNAG_01369,CNAG_07313 SRS885893 VNBI chr05

The Normalized depth (norm_depth) is the median of the normalized depth of the windows that are part of the CNV region.

Get the percentage of each chromosome covered by repeats to know how much of a chromosome might not have reliable CNV calls.

Explore repeats

Code
lineages <- unique(metadata$lineage)
repeats_all <- data.frame()
for(lineage in lineages){
    repeats_path <-paste(
        repeats_path_prefix,
        lineage, "/", lineage, "_repeats.bed",
        sep ="")
    repeats <- read.delim(repeats_path, 
        header=FALSE, 
        col.names=c("accession", "start", "end", "repeat_type"), 
        sep="\t")
    repeats$lineage <- lineage
    repeats_all <- rbind(repeats_all, repeats)
}
Code
repeats_percent <- repeats_all %>%
    mutate(repeat_size_each = end - start)%>%
    group_by(accession, lineage) %>%
    summarise(repeat_size = sum(repeat_size_each)) %>%
    left_join(chromosome_lengths, by="accession") %>%
    left_join(chromosome_names, by=c("accession","lineage")) %>%
    mutate(percent_repeat_size = round((repeat_size / length) * 100, 2))%>%
    mutate(chromosome = as.factor(chromosome))%>%
    select(lineage, accession, chromosome, percent_repeat_size)

Most chromosomes have repeats in between 5 and 10% of the size. In VNBII it is closer to 15% for some chormosomes.

Explore called duplications

Code
ggplot(dup_calls, aes(x = repeat_fraction, y = norm_depth)) +
        geom_point(aes(color = lineage)) +
        facet_wrap(~chromosome, ncol = 2)+
        theme_minimal() +
        theme(legend.position = "bottom")+
        labs(title = "Repetitive Fraction vs. Depth",
            y = "Normalized Depth",
            x = "Fraction of CNV Overlaping with Repetitive Sequences")

Code
ggplot(dup_calls, aes(x = region_size, y = norm_depth)) +
        geom_point(aes(color = lineage)) +
        facet_wrap(~chromosome, ncol = 2)+
        scale_x_continuous(labels = scales::comma) +
        theme_minimal() +
        theme(legend.position = "bottom")+
        labs(title = "Size of CNV vs. Depth",
            y = "Normalized Depth",
            x = "Size of CNV")

Code
ggplot(dup_calls, aes(x = start, y = norm_depth)) +
        geom_point(aes(color = lineage)) +
        facet_wrap(~chromosome, ncol = 2, scales = "free_x")+
        scale_x_continuous(labels = scales::comma) +
        theme_minimal() +
        theme(legend.position = "bottom")+
        labs(title = "Depth Along Chromosomes",
            y = "Normalized Depth",
            x = "Position")

There is a duplication in many VNI samples in Chr02 with depth from 10 to 70X with a similar size and position, and with more than half of the CNV overlapping with repetitive sequences.

Filter out duplication of Chr2

Since that group in chr02 is the only with both depth higher than 10 and repeat fraction higher than 0.5, we use those thresholds to remove that group of CNVs.

Code
dup_calls_filtered <- dup_calls %>%
    filter(!(norm_depth > 10 & repeat_fraction > 0.5))
Code
ggplot(dup_calls_filtered, aes(x = repeat_fraction, y = norm_depth)) +
        geom_point(aes(color = lineage)) +
        facet_wrap(~chromosome, ncol = 2)+
        theme_minimal() +
        theme(legend.position = "bottom")+
        labs(title = "Repetitive Fraction vs. Depth",
            y = "Normalized Depth",
            x = "Fraction of CNV Overlaping with Repetitive Sequences")

Code
ggplot(dup_calls_filtered, aes(x = region_size, y = norm_depth)) +
        geom_point(aes(color = lineage)) +
        facet_wrap(~chromosome, ncol = 2)+
        scale_x_continuous(labels = scales::comma) +
        theme_minimal() +
        theme(legend.position = "bottom")+
        labs(title = "Size of CNV vs. Depth",
            y = "Normalized Depth",
            x = "Size of CNV")

Code
ggplot(dup_calls_filtered, aes(x = start, y = norm_depth)) +
        geom_point(aes(color = lineage)) +
        facet_wrap(~chromosome, ncol = 2, scales = "free")+
        scale_x_continuous(labels = scales::comma) +
        theme_minimal() +
        theme(legend.position = "bottom")+
        labs(title = "Depth Along Chromosomes",
            y = "Normalized Depth",
            x = "Position")

There is duplication in many VNI samples in Chr01 with a depth of up to 10X, it is not highly repetitive and it is in the same position.

Provisional filtration of chr01 VNI duplication

Explore chr01 to know how to filter that out.

Code
chr1_dup <- dup_calls_filtered %>%
    filter(chromosome == "chr01")
Code
ggplot(chr1_dup, aes(x = start, y = sample, color = norm_depth)) +
    geom_point() +
    scale_x_continuous(labels = scales::comma, breaks = seq(0, max(chr1_dup$start), by = 100000)) +
    theme_classic() +
    theme(legend.position = "bottom",
        axis.text.y = element_blank(),
        axis.ticks.y = element_blank(),
        axis.text.x = element_text(angle = 45, hjust = 1)) +
    labs(title = "CNVs along Chr01 in all samples colored by depth",
        y = "Sample",
        x = "Start Position")

Make groups of CNVs that start at the same position in the same chromosome.

Code
dup_summary <- dup_calls_filtered %>%
    group_by(start, accession, lineage) %>%
    summarize(median_depth = mean(norm_depth),
              n = n(),
              chromosome = first(chromosome),
              size = mean(region_size))

Print groups in chr01 with median higher than 4.

Code
dup_summary %>%
    filter(chromosome == "chr01", median_depth > 4)
start accession lineage median_depth n chromosome size
1 CP003820.1 VNI 7.885113 133 chr01 2101.504
339001 CP003820.1 VNI 10.320000 1 chr01 12500.000
339501 CP003820.1 VNI 5.839743 506 chr01 10930.830

Print groups in chr01 that start between the positions 337000 and 340000.

Code
dup_summary %>%
    filter(chromosome == "chr01", start > 337000, start < 340000)
start accession lineage median_depth n chromosome size
339001 CP003820.1 VNI 10.320000 1 chr01 12500.00
339501 CP003820.1 VNI 5.839743 506 chr01 10930.83

Filter out the CNVs in chr01 that start between the positions 337000 and 340000.

Code
dup_calls_filtered <- dup_calls_filtered %>%
    filter(!(chromosome == "chr01" & start > 337000 & start < 340000))
Code
ggplot(dup_calls_filtered, aes(x = repeat_fraction, y = norm_depth)) +
        geom_point(aes(color = lineage)) +
        facet_wrap(~chromosome, ncol = 2)+
        theme_minimal() +
        theme(legend.position = "bottom")+
        labs(title = "Repetitive Fraction vs. Depth",
            y = "Normalized Depth",
            x = "Fraction of CNV Overlaping with Repetitive Sequences")

Code
ggplot(dup_calls_filtered, aes(x = region_size, y = norm_depth)) +
        geom_point(aes(color = lineage)) +
        facet_wrap(~chromosome, ncol = 2)+
        scale_x_continuous(labels = scales::comma) +
        theme_minimal() +
        theme(legend.position = "bottom")+
        labs(title = "Size of CNV vs. Depth",
            y = "Normalized Depth",
            x = "Size of CNV")

Code
ggplot(dup_calls_filtered, aes(x = start, y = norm_depth)) +
        geom_point(aes(color = lineage)) +
        facet_wrap(~chromosome, ncol = 2, scales = "free")+
        scale_x_continuous(labels = scales::comma) +
        theme_minimal() +
        theme(legend.position = "bottom")+
        labs(title = "Depth Along Chromosomes",
            y = "Normalized Depth",
            x = "Position")

Join all data and metadata

The dataframe all_metrics contains a row per individual called duplication with the information about the CNV and the chromosome of the sample. For the chromosome-sample without any called duplication, the columns related to CNVs are NAs.

Code
all_metrics <- left_join(depth_by_chrom, dup_calls_filtered, 
        by = c("sample", "accession"))%>%
    select(-chromosome, -lineage)%>%
    left_join(chromosome_lengths, by = "accession") %>%
    left_join(chromosome_names,by ="accession") %>%
    left_join(metadata, by = c("sample", "lineage"))
head(all_metrics)
sample accession norm_chrom_median norm_chrom_mean start end cnv region_size depth norm_depth smooth_depth repeat_fraction overlap_bp feature_id length lineage chromosome strain source dataset vni_subdivision samples_in_dataset_lineage samples_in_lineage total_samples
SRS404518 CP003820.1 0.98 0.97 NA NA NA NA NA NA NA NA NA NA 2291499 VNI chr01 Bt134 Clinical Desjardins VNIa-5 185 853 1055
SRS404518 CP003821.1 0.98 1.12 NA NA NA NA NA NA NA NA NA NA 1621675 VNI chr02 Bt134 Clinical Desjardins VNIa-5 185 853 1055
SRS404518 CP003822.1 0.99 1.01 1533001 1537000 duplication 4000 1246.32 11.64 5.32 0.08 312 CNAG_06925,CNAG_06926 1575141 VNI chr03 Bt134 Clinical Desjardins VNIa-5 185 853 1055
SRS404518 CP003823.1 1.00 0.99 NA NA NA NA NA NA NA NA NA NA 1084805 VNI chr04 Bt134 Clinical Desjardins VNIa-5 185 853 1055
SRS404518 CP003824.1 0.99 0.98 NA NA NA NA NA NA NA NA NA NA 1814975 VNI chr05 Bt134 Clinical Desjardins VNIa-5 185 853 1055
SRS404518 CP003825.1 1.00 0.99 NA NA NA NA NA NA NA NA NA NA 1422463 VNI chr06 Bt134 Clinical Desjardins VNIa-5 185 853 1055

Get metrics per chromosome

Summarize the metrics of the called CNVs per chromosome. And keep the rest of the chromosome metrics.

Code
chrom_metrics <- all_metrics %>%
    group_by(dataset,lineage,
            samples_in_lineage, samples_in_dataset_lineage, 
            total_samples,sample, 
            strain,source,
            accession,chromosome, length, norm_chrom_mean, norm_chrom_median) %>%
    summarise(total_cnv_size = sum(region_size),
                n_cnvs = n(),
                first = min(start),
                last = max(end),
                mean_cnv_depth = round(mean(norm_depth),2),
                median_cnv_depth = round(median(norm_depth),2),
                repeat_size = sum(overlap_bp)) %>%
            ungroup()%>%
    mutate(dup_coverage_percent = round((total_cnv_size / length) * 100, 2),
            dup_span_size = last - first,
            dup_span_percent = round((dup_span_size / length) * 100, 2),
            dup_repeat_percent = round((repeat_size / length) * 100, 2),
            chromosome = as.factor(chromosome),
            dup_coverage_percent = ifelse(is.na(dup_coverage_percent), 0, dup_coverage_percent),
            dup_span_percent = ifelse(is.na(dup_span_percent), 0, dup_span_percent))
Code
cat("The total number of chromosomes in all samples in the dataset is:", nrow(chrom_metrics))
The total number of chromosomes in all samples in the dataset is: 14770
Code
cat("The total number of chromosomes in all samples with called duplications is:", nrow(chrom_metrics %>% filter(!is.na(total_cnv_size))))
The total number of chromosomes in all samples with called duplications is: 3865
Code
ggplot(chrom_metrics, aes(x = dup_coverage_percent, y = n_cnvs)) +
        geom_point(aes(color = dup_span_percent)) +
        scale_x_continuous(labels = scales::comma) +
        scale_color_distiller(palette = "BuPu", direction = 1, name = "Percent Spanned\nby CNVs") +
        theme_minimal() +
        theme(legend.position = "right")+
        labs(title = "Percent of Chromosome Covered by CNVs vs Number of CNVs",
            y = "Number of CNVs in Chromosome",
            x = "Percent of Chromosome Covered by CNVs")

Code
ggplot(chrom_metrics, aes(x = dup_coverage_percent, y = dup_span_percent)) +
        geom_point(aes(color = n_cnvs)) +
        scale_x_continuous(labels = scales::comma) +
        scale_color_distiller(palette = "GnBu", direction = -1, trans = "log2", name = "Number\n of CNVs") +
        theme_minimal() +
        theme(legend.position = "right")+
        labs(title = "Percent of Chromosome Covered by CNVs vs Spanned by CNVs",
            y = "Percent of Chromosome Spanned by CNVs",
            x = "Percent of Chromosome Covered by CNVs")

Code
ggplot(chrom_metrics, aes(x = dup_coverage_percent, y = dup_span_percent)) +
        geom_point(aes(color = n_cnvs)) +
        facet_wrap(~chromosome, ncol = 3)+
        scale_x_continuous(labels = scales::comma) +
        scale_color_distiller(palette = "GnBu", direction = -1, trans = "log2", name = "Number\n of CNVs") +
        theme_minimal() +
        theme(legend.position = "right")+
        labs(title = "Percent of Chromosome Covered by CNVs vs Spanned by CNVs",
            y = "Percent of Chromosome Spanned by CNVs",
            x = "Percent of Chromosome Covered by CNVs")

Code
ggplot(chrom_metrics, aes(x = dup_coverage_percent, y = dup_span_percent)) +
        geom_point(aes(color = dup_repeat_percent)) +
        scale_x_continuous(labels = scales::comma) +
        scale_color_distiller(palette = "YlOrRd", direction = 1, trans = "log2", name = "Percent of CNV\nOverlaped by Repeats") +
        theme_minimal() +
        theme(legend.position = "right")+
        labs(title = "Percent of Chromosome Covered by CNVs vs Spanned by CNVs",
            y = "Percent of Chromosome Spanned by CNVs",
            x = "Percent of Chromosome Covered by CNVs")
Warning in scale_color_distiller(palette = "YlOrRd", direction = 1, trans =
"log2", : log-2 transformation introduced infinite values.

Code
ggplot(chrom_metrics)+
    geom_histogram(aes(dup_coverage_percent), binwidth = 1)+
    scale_x_continuous(breaks = seq(0,100, by = 5)) +
    theme_minimal() +
    labs(title = "Histogram of Percent of Chromosome Covered by CNVs",
            y = "Count",
            x = "Percent of Chromosome Covered by CNVs")

Code
ggplot(filter(chrom_metrics, dup_coverage_percent > 0))+
    geom_histogram(aes(dup_coverage_percent), binwidth = 1)+
    scale_x_continuous(breaks = seq(0,100, by = 5)) +
    theme_minimal() +
    labs(title = "Histogram of Percent of Chromosome Covered by CNVs",
            subtitle = "Only values above 0%",
            y = "Count",
            x = "Percent of Chromosome Covered by CNVs")

Code
ggplot(filter(chrom_metrics, dup_coverage_percent > 15))+
    geom_boxplot(aes(x = chromosome, y = dup_coverage_percent))+
    geom_quasirandom(aes(x = chromosome, y = dup_coverage_percent))+
    theme_minimal() +
    labs(title = "Boxplot of Percent of Chromosome Covered by CNVs per Chromosome",
            subtitle = "Only values above 15%",
            y = "Count",
            x = "Percent of Chromosome Covered by CNVs")

Code
ggplot(filter(chrom_metrics, dup_span_percent > 0))+
    geom_histogram(aes(dup_span_percent), binwidth = 1)+
    scale_x_continuous(breaks = seq(0,100, by = 5)) +
    theme_minimal() +
    labs(title = "Histogram of Percent of Chromosome Spanned by CNVs",
            subtitle = "Only values above 0%",
            y = "Count",
            x = "Percent of Chromosome Spanned by CNVs")

Code
ggplot(chrom_metrics)+
    geom_histogram(aes(norm_chrom_median), binwidth = 0.01)+
    scale_x_continuous(breaks = seq(0,max(chrom_metrics$norm_chrom_median), by = 0.1)) +
    theme_minimal() +
    labs(title = "Histogram of Normalized median depth of chromosomes",
            y = "Count",
            x = "Normalized median depth of chromosomes")

Code
ggplot(filter(chrom_metrics, norm_chrom_median > 1))+
    geom_histogram(aes(norm_chrom_median), binwidth = 0.01)+
    scale_x_continuous(breaks = seq(0,max(chrom_metrics$norm_chrom_median), by = 0.1)) +
    theme_minimal() +
    labs(title = "Histogram of Normalized median depth of chromosomes",
            subtitle = "Only values above 1",
            y = "Count",
            x = "Normalized median depth of chromosomes")

Code
ggplot(filter(chrom_metrics, norm_chrom_median > 1.2))+
    geom_histogram(aes(norm_chrom_median), binwidth = 0.01)+
    scale_x_continuous(breaks = seq(0,max(chrom_metrics$norm_chrom_median), by = 0.1)) +
    theme_minimal() +
    labs(title = "Histogram of Normalized median depth of chromosomes",
            subtitle = "Only values above 1.2",
            y = "Count",
            x = "Normalized median depth of chromosomes")

Code
ggplot(chrom_metrics)+
    geom_histogram(aes(median_cnv_depth), binwidth = 0.01)+
    scale_x_continuous(breaks = seq(0, max(chrom_metrics$median_cnv_depth, na.rm = TRUE), by = 0.5)) +
    theme_minimal() +
    labs(title = "Histogram of Median Depth of CNVs",
            y = "Count",
            x = "Median Depth of CNVs")
Warning: Removed 10905 rows containing non-finite outside the scale range
(`stat_bin()`).

Compare metrics of CNVs with chromosome depth

The Normalized depth (norm_depth) is the median of the normalized depth of the windows that are part of each CNV region.

The Median depth of CNV (median_cnv_depth) is the median of the Normalized depth of all CNVs in the chromosome. There are values bellow the threshold of depth to call a Duplication CNV because that threshold was applied to the smoothed values and this are the raw values.

The Normalized median depth of chromosomes (norm_chrom_median) is the normalized median of the depth of the positions in the whole chromosome. (First the median was calculated and then normalized).

The Percent of Chromosome Covered by CNVs (dup_coverage_percent) is the percentage of the full length of the chromosome that is part of called CNVs.

The Percent of Chromosome Spanned by CNVs (dup_span_percent) is the percentage of the full length of the chromosome that is in between the leftmost and rightmost CNVs.

Code
color_range <- paste("2.6 -", max(chrom_metrics$median_cnv_depth, na.rm = TRUE))
colors <- c("#FDE725", "#1F9E89", "#440154")
names(colors) <-c("0 - 1.6", "1.6 - 2.6", color_range)
ggplot(chrom_metrics, aes(x = dup_coverage_percent, y = norm_chrom_median)) +
        geom_hline(yintercept = c(1, 2), color = "black", linetype = "dashed") +
        geom_point(aes(color = cut(median_cnv_depth, 
                            breaks = c(-Inf, 1.6, 2.6, Inf), 
                            labels = c("0 - 1.6", "1.6 - 2.6", color_range))),
                            alpha = 0.5) +
        scale_color_manual(values = colors, 
                            name = "Median CNV Depth") +
        theme_minimal() +
        theme(legend.position = "right") +
        labs(title = "Percent of Chromosome Covered by CNVs vs. Depth of Chromosome",
             y = "Normalized Median Depth of Chromosome",
             x = "Percent of Chromosome Covered by CNVs")

Code
ggplot(chrom_metrics, aes(x = dup_coverage_percent, y = norm_chrom_median)) +
        facet_wrap(~chromosome, ncol = 4)+
        geom_hline(yintercept = c(1, 2), color = "black", linetype = "dashed") +
        geom_point(aes(color = cut(median_cnv_depth, 
                            breaks = c(-Inf, 1.6, 2.6, Inf), 
                            labels = c("0 - 1.6", "1.6 - 2.6", color_range))),
                            alpha = 0.5) +
        scale_color_manual(values = colors, 
                            name = "Median CNV Depth") +
        theme_minimal() +
        theme(legend.position = "right") +
        labs(title = "Percent of Chromosome Covered by CNVs vs. Depth of Chromosome",
             y = "Normalized Median Depth of Chromosome",
             x = "Percent of Chromosome Covered by CNVs")

Explore putative duplicated chromosomes

Filter by percent of chromosome in CNV or chromosome depth

size_threshold <- 50
depht_threshold <- 1.55
Code
chrom_metrics_filtered <- chrom_metrics %>%
    filter(dup_coverage_percent >= size_threshold | norm_chrom_median >= 1.55)
Code
ggplot(chrom_metrics_filtered, aes(x = dup_coverage_percent, y = norm_chrom_median)) +
        geom_hline(yintercept = c(1, 2), color = "black", linetype = "dashed") +
        geom_point(aes(color = cut(median_cnv_depth, 
                            breaks = c(-Inf, 1.6, 2.6, Inf), 
                            labels = c("0 - 1.6", "1.6 - 2.6", color_range))),
                            alpha = 0.5) +
        scale_color_manual(values = colors, 
                            name = "Median CNV Depth") +
        theme_minimal() +
        theme(legend.position = "right") +
        labs(title = "Percent of Chromosome Covered by CNVs vs. Depth of Chromosome",
             y = "Normalized Median Depth of Chromosome",
             x = "Percent of Chromosome Covered by CNVs")

Code
ggplot(chrom_metrics_filtered, aes(x = dup_coverage_percent, y = norm_chrom_median)) +
        facet_wrap(~chromosome, ncol = 3)+

        geom_hline(yintercept = c(1, 2), color = "black", linetype = "dashed") +
        geom_point(aes(color = cut(median_cnv_depth, 
                            breaks = c(-Inf, 1.6, 2.6, Inf), 
                            labels = c("0 - 1.6", "1.6 - 2.6", color_range))),
                            alpha = 0.5) +
        scale_color_manual(values = colors, 
                            name = "Median CNV Depth") +
        theme_minimal() +
        theme(legend.position = "right") +
        labs(title = "Percent of Chromosome Covered by CNVs vs. Depth of Chromosome",
             y = "Normalized Median Depth of Chromosome",
             x = "Percent of Chromosome Covered by CNVs")

Code
ggplot(chrom_metrics_filtered, aes(x = dup_coverage_percent, y = norm_chrom_median)) +
        geom_point(aes(color = n_cnvs)) +
        facet_wrap(~chromosome, ncol = 3)+
        scale_x_continuous(labels = scales::comma) +
        scale_color_distiller(palette = "GnBu", direction = -1, trans = "log2", name = "Number\n of CNVs") +
        theme_minimal() +
        geom_hline(yintercept = c(1, 2), color = "black", linetype = "dashed") +
        labs(title = "Percent of Chromosome Covered by CNVs vs. Depth of Chromosome",
            y = "Normalized Median Depth of Chromosome",
            x = "Percent of Chromosome Covered by CNVs")+
        theme(panel.grid.major.x = element_blank(),
            panel.grid.minor.x = element_blank(),
            panel.grid.major.y = element_line(color = "lightgray"),
            panel.grid.minor.y = element_blank(),
            panel.background = element_rect(fill = "white"),
            axis.text.x = element_text(angle = 45, hjust = 1),
            panel.border = element_rect(colour = "lightgray", fill=NA, linewidth = 2),
            legend.position = "right")

Code
ggplot(chrom_metrics_filtered, aes(x = dup_coverage_percent, y = norm_chrom_median)) +
        geom_point(aes(color = dup_span_percent)) +
        facet_wrap(~chromosome, ncol = 3)+
        scale_x_continuous(labels = scales::comma) +
        scale_color_distiller(palette = "BuPu", direction = 1, name = "Percent Spanned\nby CNVs") +
        geom_hline(yintercept = c(1, 2), color = "black", linetype = "dashed") +
        labs(title = "Percent of Chromosome Covered by CNVs vs. Depth of Chromosome",
            y = "Normalized Median Depth of Chromosome",
            x = "Percent of Chromosome Covered by CNVs")+
        theme_minimal()+
        theme(panel.grid.major.x = element_blank(),
            panel.grid.minor.x = element_blank(),
            panel.grid.major.y = element_line(color = "lightgray"),
            panel.grid.minor.y = element_blank(),
            panel.background = element_rect(fill = "white"),
            axis.text.x = element_text(angle = 45, hjust = 1),
            panel.border = element_rect(colour = "lightgray", fill=NA, linewidth = 2),
            legend.position = "right")

Code
write_tsv(chrom_metrics_filtered, duplications_path)

Explore sample with very large Median Depth of Chromosome

Code
chrom_metrics_filtered %>%
    filter(norm_chrom_median > 2.5)%>%
    select(sample, strain, chromosome, norm_chrom_median, dup_coverage_percent)
sample strain chromosome norm_chrom_median dup_coverage_percent
ERS542397 14936_1#6 chr04 2.74 64.94